Set Up

  • Install/load necessary R packages
  • Set working directory if necessary (or create a file path to use throughout the RMD to call the data from Box)

S2 Stripchart Streamflow Data

This RMD is the first in the workflow and reads in, cleans, and visualizes the stripchart stream height data for the 2017 - 2021 water years at the S2 bog site.

Water Year 2017

  • Read in the WY 2017 data from the External-MEF_DATA Box folder
    • Specifically, raw data for the S2 Bog site is found at the following file path: External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2
  • Clean and manipulate the WY 2017 data into a tidy format
    • Rename columns to use lower_snake_case and avoid special characters
    • Create a new date column that follows the format YYYY-MM-DD (and does not include a timestamp) for ease of sorting/manipulating dates
  • Test that the manipulated data frame has the same number of rows as the original data frame (we do not want to accidentally lose data, so the code will throw an error is the two data frames do not match)
  • Plot the WY 2017 data
#create file path to call the data from Box 
## Mia's file path 
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"

#read in the 2017 data 
wy2017 <- read_excel(path = here(filepath, "WY2017.S2_Breakpoint.xlsx"), 
                     skip =3) #skip the top 3 rows of the Excel Sheet 

#clean the 2017 data
wy2017_clean <- wy2017 %>% 
  #change column names
  rename(datetime = "Date/time",
         stream_height_ft = "Stage.ft") %>% 
  #remove unnecessary columns 
  select(c("datetime", "stream_height_ft")) %>% 
  #format column names into lower_snake_case
  clean_names() %>% 
  #create a date column 
  mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S', 
                                  tz = "GMT"), 
                       format = '%Y-%m-%d'),
         date = as.POSIXct(date, tz = "GMT")) 

#save clean data CSV to use in future RMD files
write.csv(x = wy2017_clean, 
          file = file.path(here("intermediate_data", "wy2017_clean.csv")),
          row.names = FALSE)

#test the 2017 data
if(nrow(wy2017) != nrow(wy2017_clean)) stop("Check clean dataframe dimensions")

#plot 
p_2017 <- ggplot(data = wy2017_clean) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (ft)", 
       title = "WY 2017 S2 Bog Weir Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#static plot 
#p_2017

#interactive plot 
ggplotly(p_2017)

Water Year 2018

  • Read in the WY 2018 data from the External-MEF_DATA Box folder
    • Raw data for the S2 Bog site is found at the following file path: External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2
  • Clean and manipulate the WY 2018 data into a tidy format
  • Change stream height values of 0 to missing NA values during the following time periods:
    • December 1-3 clock stopped, no record

    • December 8-10 pin reached end of chart, no record

    • Dec 11-end, clock is barely running. No way to tell which days are what on the chart.

    • November 10-14 clock stopped, no record

    • November 25- December 4 clock stopped, no record Possibly the clock was moving super slowly for some of it?

    • For Oct 9-10 there was no rain for flow to increase

    • Oct 11-15 clock was stuck and the pin did not move forward in time

  • NA values are derived from notes recorded in .txt files found at the following Box filepath: External-MEF_DATA/Hydro/Streamflow/L0_daily/StripCharts/DailyBreakpoint/2018
  • Test that the manipulated data frame has the same number of rows as the original data frame
  • Plot the WY 2018 data
#create file path to call the data from Box 
## Mia's file path 
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"

#create a function to specify custom time intervals 
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]

#read in the 2018 data 
wy2018 <- read_excel(path = here(filepath, "WY2018.S2_Breakpoint.xlsx"), 
                     skip =3)

#clean the 2018 data
wy2018_clean <- wy2018 %>% 
  rename(datetime = "Date/time",
         stream_height_ft = "Stage.ft") %>% 
  select(c("datetime", "stream_height_ft")) %>% 
  clean_names() %>% 
  mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S', 
                                  tz = "GMT"), 
                       format = '%Y-%m-%d'),
         date = as.POSIXct(date, tz = "GMT")) 

#assign October NA values 
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-10-09', '2018-10-10'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-10-11', '2018-10-15'))] <- NA

#assign November NA values 
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-11-10', '2018-11-14'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-11-25', '2018-12-04'))] <- NA

#assign December NA values 
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-01', '2018-12-03'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-08', '2018-12-10'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-11', '2018-12-31'))] <- NA

#test the 2018 data
if(nrow(wy2018) != nrow(wy2018_clean)) stop("Check clean dataframe dimensions")

#plot
p_2018_clean <- ggplot(data = wy2018_clean) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (ft)", 
       title = "WY 2018 S2 Bog Weir Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#note the weird oscillating values from Oct 31 - Nov 1
#static plot
#p_2018_clean

#interactive plot 
ggplotly(p_2018_clean)

Next, we need to remove unusual stream height values from Oct 31 - Nov 1, 2018. During this time period, the stream height values oscillated rapidly, likely indicating human error in recording the data. Thus, we need to remove the erroneous values to smooth out the line. For more details see the next code chunk below.

wy2018_altered <- wy2018_clean

#for now, remove erroneous values < 0.10 during the time period of interest 
wy2018_altered <- subset(wy2018_altered, !(wy2018_altered$date %between% as.Date(c('2018-10-31', '2018-11-01')) & wy2018_altered$stream_height_ft < 0.10))

#test the 2018 altered data - there should be fewer rows in the altered dataframe than the original/cleaned dataframe 
if(nrow(wy2018_altered) == nrow(wy2018_clean)) stop("Check altered dataframe dimensions")

#plot
p_2018_altered <- ggplot(data = wy2018_altered) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (ft)", 
       title = "WY 2018 S2 Bog Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#save clean data CSV to use in future RMD files
write.csv(x = wy2018_altered, 
          file = file.path(here("intermediate_data", "wy2018_altered.csv")),
          row.names = FALSE)

#note that the wonky values have been removed and the graphis now smoothed out 
#static plot
#p_2018_altered

#interactive plot 
ggplotly(p_2018_altered)

Altered Oct 31 - Nov 1, 2018 Streamflow Data

During this time, it is most likely that the dates were read/input incorrectly, resulting in an unnatural oscillation of stream flow values on Oct 31 and Nov 1 of 2018. Therefore, we removed values < 0.10 during this time period to smooth out the overall curve of stream height values.

#subset the time period of interest
subset_2018 <- wy2018_clean %>% 
  subset(date %between% as.Date(c('2018-10-30', '2018-11-02'))) 

#plot the data 
p_subset <- ggplot(data = subset_2018) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  labs(title = "Wonky Oct 31 - Nov 1 Streamflow Values")

#static plot 
p_subset

#interactive plot
#ggplotly(p_subset)

#remove unusual values below 0.10 
subset_2018_altered <- wy2018_clean %>% 
  subset(date %between% as.Date(c('2018-10-30', '2018-11-02'))) %>% 
  subset(stream_height_ft > 0.10)

#plot the altered data
p_subset_altered <- ggplot(data = subset_2018_altered) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  labs(title = "Fixed Oct 31 - Nov 1 Streamflow Values")

#static plot
p_subset_altered

#interactive plot
#ggplotly(p_subset_altered)

Water Year 2019

  • Read in the WY 2019 data from the External-MEF_DATA Box folder
    • Raw data for the S2 Bog site is found at the following file path: External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2
  • Clean and manipulate the WY 2019 data into a tidy format
  • Change values of 0 to missing NA values during the following time periods:
    • April 1-2, Line and time is off

    • April 8-17. Line doesn’t fluctuate though there should’ve been rain or if not rain, it should’ve fallen from how high the flow is

    • Proven in that the adjustment for 4/15 flow is great and S6 Weir chart response. Therefore, no record

    • April 17-22 discounted because pen level is off with no way to account for it

    • February: Clock errors, record compromised. Flow is below zero for month Clock restarted 2/21/2019. Measurements still below zero.

    • January: Clock errors meant that time could not be used to record accurate measurements. Measurements tend to be around 0.001 and chart line is there or below zero. Physical measurements indicate flow until at least 1/15/2019

    • March 14, Pen reverses and goes off the chart

    • March 18th, Pen placed back on chart, still reversed (fixed 4/23)

    • March 30-31 Pen stays in place even though there should be a response. Likely because it reached the bottom of the chart (pen was reversed)

    • September: Chart did not run from 9/3-9/10

  • NA values are derived from notes recorded in .txt files found at the following Box filepath: External-MEF_DATA/Hydro/Streamflow/L0_daily/StripCharts/DailyBreakpoint/2019/digitized/S2-19/
  • Test the data
  • Plot the WY 2019 data
#create file path to call the data from Box 
## Mia's file path 
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"

#read in the 2019 data 
wy2019 <- read_excel(path = here(filepath, "WY2019.S2_Breakpoint.xlsx"), 
                     skip =3)

#clean the 2019 data
wy2019_clean <- wy2019 %>% 
  rename(datetime = "Date/time",
         stream_height_ft = "Stage.ft") %>% 
  select(c("datetime", "stream_height_ft")) %>% 
  clean_names() %>% 
  mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S', 
                                  tz = "GMT"), 
                       format = '%Y-%m-%d'),
         date = as.POSIXct(date, tz = "GMT")) 

#assign February NA values 
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-01', '2019-04-02'))] <- NA

#assign March NA values 
wy2019_clean$stream_height_ft[wy2019_clean$date == as.Date('2019-03-14')] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date == as.Date('2019-03-18')] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-03-30', '2019-03-31'))] <- NA

#assign April NA values 
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-01', '2019-04-02'))] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-08', '2019-04-17'))] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-17', '2019-04-22'))] <- NA

#assign September NA values 
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-09-03', '2019-09-10'))] <- NA

#save clean data CSV to use in future RMD files
write.csv(x = wy2019_clean, 
          file = file.path(here("intermediate_data", "wy2019_clean.csv")),
          row.names = FALSE)

#test the 2019 data 
if(nrow(wy2019) != nrow(wy2019_clean)) stop("Check clean dataframe dimensions")

#plot 
p_2019 <- ggplot(data = wy2019_clean) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (Ft)",
       title = "WY 2019 S2 Bog Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#static plot
#p_2019

#interactive plot
ggplotly(p_2019)

Water Year 2020

  • Read in the WY 2020 data from the External-MEF_DATA Box folder
    • Raw data for the S2 Bog site is found at the following file path: External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2
  • Clean and manipulate the WY 2020 data into a tidy format
  • Change values of 0 to missing NA values during the following time periods:
    • These gaps are not 0 flow (including the begin and end dates given), and should be replaced with NA unless Jake can digitize them: 7/28/2020 13:40:00 through 8/11/2020 00:00:00 (chart needs correction)
  • NA values are derived from notes recorded in .txt files found at the following Box filepath: External-MEF_DATA/Hydro/Streamflow/L0_daily/StripCharts/DailyBreakpoint/2020/digitized/S2-20/wy2020_s2_gaps.txt
  • Test the data
  • Plot the WY 2020 data
#create file path to call the data from Box 
## Mia's file path 
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"

#read in the 2020 data 
wy2020 <- read_excel(path = here(filepath, "WY2020.S2_Breakpoint.xlsx"), 
                     skip =3)

#clean the 2020 data
wy2020_clean <- wy2020 %>% 
  rename(datetime = "Date/time",
         stream_height_ft = "Stage.ft") %>% 
  select(c("datetime", "stream_height_ft")) %>% 
  clean_names() %>% 
  mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S', 
                                  tz = "GMT"), 
                       format = '%Y-%m-%d'),
         date = as.POSIXct(date, tz = "GMT")) 

#create date range to fill in with NA values 
start_date <- as.POSIXct("7/28/2020 13:40:00", 
                         format = '%m/%d/%Y %H:%M:%S',
                         tz = "GMT")
end_date <- as.POSIXct("8/11/2020 00:00:00", 
                         format = '%m/%d/%Y %H:%M:%S',
                         tz = "GMT")

#create a function to specify custom time intervals 
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]

#assign February NA values 
wy2020_clean$stream_height_ft[wy2020_clean$datetime %between% c(start_date, end_date)] <- NA

#save clean data CSV to use in future RMD files
write.csv(x = wy2020_clean, 
          file = file.path(here("intermediate_data", "wy2020_clean.csv")),
          row.names = FALSE)

#test the 2020 data 
if(nrow(wy2020) != nrow(wy2020_clean)) stop("Check clean dataframe dimensions")

#plot 
p_2020 <- ggplot(data = wy2020_clean) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (Ft)",
       title = "WY 2020 S2 Bog Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#static plot
#p_2020

#interactive plot
ggplotly(p_2020)

Water Year 2021

  • Read in the WY 2021 data from the External-MEF_DATA Box folder
    • Raw data for the S2 Bog site is found at the following file path: External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2
  • Clean and manipulate the WY 2021 data into a tidy format
  • Change values of 0 to missing NA values during the following time periods:
    • These gaps are not 0 flow (including the begin and end dates given), and should be replaced with NA unless Jake can digitize them:
      • 2021-11-11 23:59:00 through 2021-12-29 00:00:00 (clock running fast on chart then clock died, well froze. Datalogger will have most of the data and is correct, except shaft encoder float froze at .14, see scanned chart for more remarks)

      • and also (beginning values off, should be .18 float frozen for several hours) 2021-03-04 00:00:00 through 2021-03-09 00:00:00 (need to correct for start of flow. See datalogger data)

  • NA values are derived from notes recorded in .txt files found at the following Box filepath: External-MEF_DATA/Hydro/Streamflow/L0_daily/StripCharts/DailyBreakpoint/2021/digitized/S2-21/wy2021_s2_gaps.txt
  • Test the data
  • Plot the WY 2021 data
#create file path to call the data from Box 
## Mia's file path 
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"

#read in the 2021 data 
wy2021 <- read_excel(path = here(filepath, "WY2021.S2_Breakpoint.xlsx"), 
                     skip =3)

#clean the 2021 data
wy2021_clean <- wy2021 %>% 
  rename(datetime = "Date/time",
         stream_height_ft = "Stage.ft") %>% 
  select(c("datetime", "stream_height_ft")) %>% 
  clean_names() %>% 
  mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S', 
                                  tz = "GMT"), 
                       format = '%Y-%m-%d'),
         date = as.POSIXct(date, tz = "GMT")) 

#identify date intervals that need to be filled w/NAs
date1 <- as.POSIXct("2021-03-04 00:00:00", 
                         format = '%Y-%m-%d %H:%M:%S',
                         tz = "GMT")
date2 <- as.POSIXct("2021-03-09 00:00:00", 
                         format = '%Y-%m-%d %H:%M:%S',
                         tz = "GMT")
date3 <- as.POSIXct("2021-11-11 23:59:00", 
                         format = '%Y-%m-%d %H:%M:%S',
                         tz = "GMT")
date4 <- as.POSIXct("2021-12-29 00:00:00", 
                         format = '%Y-%m-%d %H:%M:%S',
                         tz = "GMT")


#create a function to specify custom time intervals 
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]

#assign March NA values
wy2021_clean$stream_height_ft[wy2021_clean$datetime %between% c(date1, date2)] <- NA

#assign Nov-Dec NA values 
wy2021_clean$stream_height_ft[wy2021_clean$datetime %between% c(date3, date4)] <- NA

#save clean data CSV to use in future RMD files
write.csv(x = wy2021_clean, 
          file = file.path(here("intermediate_data", "wy2021_clean.csv")),
          row.names = FALSE)

#test the 2021 data 
if(nrow(wy2021) != nrow(wy2021_clean)) stop("Check clean dataframe dimensions")

#plot 
p_2021 <- ggplot(data = wy2021_clean) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (Ft)",
       title = "WY 2021 S2 Bog Stream Height") + 
  theme(plot.title = element_text(hjust = 0.5))

#static plot
#p_2021

#interactive plot
ggplotly(p_2021)

All Water Years

#combine 2017 - 2021 data
all_streamflow <- rbind(wy2017_clean, 
                        wy2018_altered, 
                        wy2019_clean,
                        wy2020_clean,
                        wy2021_clean)

#save clean data CSV to use in future RMD files
write.csv(x = all_streamflow, 
          file = file.path(here("intermediate_data", "all_streamflow.csv")),
          row.names = FALSE)

#plot 
p_all <- ggplot(data = all_streamflow) + 
  geom_line(aes(x = datetime, y = stream_height_ft)) + 
  theme_classic() + 
  labs(x = "Time", 
       y = "Stream Height (Ft)",
       title = "S2 Bog Stream Height (WY 2017 - 2021)") + 
  theme(plot.title = element_text(hjust = 0.5))

#static plot
#p_all

#interactive plot
ggplotly(p_all)